Data report - Gapminder dataset

Author

Saar Alon-Barkat

Published

March 1, 2024

Last update: 2024-04-30 11:50:30

Code
library(tidyverse)
library(gapminder)
library(gt)
library(gtsummary)
library(ggrepel)  
library(plotly)
library(sjPlot)

This report uses the Gapminder dataset, which includes real-world country-level panel data (1952-2007) for life expectancy, population, and GDP per capita.

The dataset is available from the gapminder R package. For more information on Gapminder, see: https://www.gapminder.org/

Code
# Quick structural overview of the data: dimensions, column types,
# and the first few values of every column.
glimpse(gapminder)
Rows: 1,704
Columns: 6
$ country   <fct> "Afghanistan", "Afghanistan", "Afghanistan", "Afghanistan", …
$ continent <fct> Asia, Asia, Asia, Asia, Asia, Asia, Asia, Asia, Asia, Asia, …
$ year      <int> 1952, 1957, 1962, 1967, 1972, 1977, 1982, 1987, 1992, 1997, …
$ lifeExp   <dbl> 28.801, 30.332, 31.997, 34.020, 36.088, 38.438, 39.854, 40.8…
$ pop       <int> 8425333, 9240934, 10267083, 11537966, 13079460, 14880372, 12…
$ gdpPercap <dbl> 779.4453, 820.8530, 853.1007, 836.1971, 739.9811, 786.1134, …


The following table summarizes the main variables across the years.

Code
# Descriptive statistics of every variable except `country`,
# with one column per survey year.
tbl_summary(
  select(gapminder, -country),
  by = year
)
Characteristic 1952, N = 142¹ 1957, N = 142¹ 1962, N = 142¹ 1967, N = 142¹ 1972, N = 142¹ 1977, N = 142¹ 1982, N = 142¹ 1987, N = 142¹ 1992, N = 142¹ 1997, N = 142¹ 2002, N = 142¹ 2007, N = 142¹
continent
    Africa 52 (37%) 52 (37%) 52 (37%) 52 (37%) 52 (37%) 52 (37%) 52 (37%) 52 (37%) 52 (37%) 52 (37%) 52 (37%) 52 (37%)
    Americas 25 (18%) 25 (18%) 25 (18%) 25 (18%) 25 (18%) 25 (18%) 25 (18%) 25 (18%) 25 (18%) 25 (18%) 25 (18%) 25 (18%)
    Asia 33 (23%) 33 (23%) 33 (23%) 33 (23%) 33 (23%) 33 (23%) 33 (23%) 33 (23%) 33 (23%) 33 (23%) 33 (23%) 33 (23%)
    Europe 30 (21%) 30 (21%) 30 (21%) 30 (21%) 30 (21%) 30 (21%) 30 (21%) 30 (21%) 30 (21%) 30 (21%) 30 (21%) 30 (21%)
    Oceania 2 (1.4%) 2 (1.4%) 2 (1.4%) 2 (1.4%) 2 (1.4%) 2 (1.4%) 2 (1.4%) 2 (1.4%) 2 (1.4%) 2 (1.4%) 2 (1.4%) 2 (1.4%)
lifeExp 45 (39, 60) 48 (41, 63) 51 (43, 65) 54 (46, 67) 57 (49, 69) 60 (50, 70) 62 (53, 71) 66 (55, 72) 68 (56, 73) 69 (56, 74) 71 (56, 75) 72 (57, 76)
pop 3,943,953 (1,452,026, 9,168,198) 4,282,942 (1,568,811, 9,817,598) 4,686,040 (1,784,362, 10,980,084) 5,170,176 (2,034,768, 12,614,585) 5,877,996 (2,351,192, 14,679,200) 6,404,036 (2,759,717, 16,670,227) 7,007,320 (3,006,286, 18,407,325) 7,774,862 (3,194,990, 20,947,542) 8,688,686 (3,605,992, 22,705,382) 9,735,064 (3,770,150, 24,311,370) 10,372,918 (4,173,506, 26,545,556) 10,517,531 (4,508,034, 31,210,042)
gdpPercap 1,969 (865, 3,913) 2,173 (931, 4,876) 2,335 (1,059, 5,709) 2,678 (1,151, 7,076) 3,339 (1,257, 9,509) 3,799 (1,357, 11,204) 4,216 (1,363, 12,348) 4,280 (1,327, 11,994) 4,386 (1,271, 10,684) 4,782 (1,367, 12,023) 5,320 (1,410, 13,360) 6,124 (1,625, 18,009)
¹ n (%); Median (IQR)

Life expectancy

Code
# Distribution of country life expectancy in each survey year:
# one boxplot per year, with the yearly mean overlaid as a light-blue point.
gapminder %>%
  ggplot(aes(x = factor(year), y = lifeExp)) +
  geom_boxplot() +
  # `fun` is the current name of the argument formerly called `fun.y`
  # (deprecated since ggplot2 3.3.0).
  stat_summary(fun = "mean", geom = "point", color = "lightblue", size = 3) +
  theme_classic() +
  labs(
    title = "Life expectancy increase over time",
    x = "",
    y = "Country life expectancy",
    caption = "Source:Gapminder"
  )

Code
# Histogram of country life expectancy in the most recent survey year (2007).
# Fill and outline colours are fixed constants, so no legend is produced.
gapminder %>%
  filter(year == 2007) %>%
  ggplot(aes(x = lifeExp)) +
  geom_histogram(bins = 12, fill = "lightblue", color = "white") +
  theme_classic() +
  labs(
    title = "Life expectancy variation in 2007",
    x = "Country life expectancy",
    y = "Frequency",
    caption = "Source:Gapminder"
  )

Code
# Histograms of country life expectancy, one panel per survey year.
# Fill and outline colours are fixed constants, so no legend is produced.
gapminder %>%
  ggplot(aes(x = lifeExp)) +
  geom_histogram(bins = 12, fill = "lightblue", color = "white") +
  theme_classic() +
  facet_wrap(vars(year)) +
  labs(
    title = "Life expectancy variation over time",
    x = "Country life expectancy",
    y = "Frequency",
    caption = "Source:Gapminder"
  )

Code
# Life expectancy by continent in 2007: one boxplot per continent,
# with the continent mean overlaid as a light-blue point.
gapminder %>%
  # Restrict to 2007 so the data matches the plot title; without this filter
  # every survey year would be pooled into each continent's box.
  filter(year == 2007) %>%
  ggplot(aes(x = continent, y = lifeExp)) +
  geom_boxplot() +
  # `fun` is the current name of the argument formerly called `fun.y`
  # (deprecated since ggplot2 3.3.0).
  stat_summary(fun = "mean", geom = "point", color = "lightblue", size = 3) +
  theme_classic() +
  labs(
    title = "Life expectancy across continents in 2007",
    x = "",
    y = "Country life expectancy",
    caption = "Source:Gapminder"
  )

Code
# Life expectancy trajectory of every country, with Israel highlighted
# as a thick black line over thin light-blue lines for all other countries.
gapminder %>%
  # Two-level flag: "Israel" for Israel, "" for everything else.
  mutate(country_israel = ifelse(country == "Israel", "Israel", "")) %>%
  ggplot(aes(
    x = year,
    y = lifeExp,
    group = country,
    color = country_israel,
    # Line thickness is mapped via `linewidth`; using `size` for lines is
    # deprecated since ggplot2 3.4.0.
    linewidth = country_israel
  )) +
  geom_line(alpha = 0.5) +
  # Unnamed manual values are assigned in sorted level order:
  # "" (other countries) first, then "Israel".
  scale_colour_manual(name = "", values = c("lightblue", "black")) +
  scale_linewidth_manual(name = "", values = c(0.2, 2)) +
  theme_classic() +
  labs(
    title = "Life expectancy increase across countries",
    x = "",
    y = "Country life expectancy",
    caption = "Source:Gapminder"
  )

Code
# Median country life expectancy per continent and survey year.
gapminder_lifeExp_tab <- gapminder %>%
  group_by(continent, year) %>%
  # Drop the grouping explicitly so the result is a plain tibble and
  # summarise() does not emit its "grouped output" message.
  summarise(lifeExp = median(lifeExp), .groups = "drop")

# Trend lines of the continent medians, labelled directly at the last
# survey year (2007) instead of using a colour legend.
gapminder_lifeExp_tab %>%
  ggplot(aes(x = year, y = lifeExp, color = continent)) +
  # `linewidth` controls line thickness (`size` for lines is deprecated
  # since ggplot2 3.4.0); `size` still applies to the points.
  geom_line(linewidth = 1) +
  geom_point(size = 1.5) +
  geom_label_repel(
    data = gapminder_lifeExp_tab %>% filter(year == 2007),
    aes(label = continent),
    nudge_x = 1,
    nudge_y = 1
  ) +
  theme_classic() +
  # Hide the colour legend; the end-of-line labels identify the continents.
  guides(color = "none") +
  labs(
    title = "Life expectancy increase across continents",
    x = "",
    y = "Country median life expectancy",
    caption = "Source:Gapminder"
  )


GDP per capita

Code
# Distribution of country GDP per capita in each survey year:
# one boxplot per year, with the yearly mean overlaid as a light-blue point.
gapminder %>%
  ggplot(aes(x = factor(year), y = gdpPercap)) +
  geom_boxplot() +
  # `fun` is the current name of the argument formerly called `fun.y`
  # (deprecated since ggplot2 3.3.0).
  stat_summary(fun = "mean", geom = "point", color = "lightblue", size = 3) +
  theme_classic() +
  labs(
    title = "GDP per capita increase over time",
    x = "",
    y = "Country GDP per capita",
    caption = "Source:Gapminder"
  )

You can see that there is an outlier in the distribution of GDP per capita.

Code
# Histogram of country GDP per capita in the most recent survey year (2007).
# Fill and outline colours are fixed constants, so no legend is produced.
gapminder %>%
  filter(year == 2007) %>%
  ggplot(aes(x = gdpPercap)) +
  geom_histogram(bins = 12, fill = "lightblue", color = "white") +
  theme_classic() +
  labs(
    title = "Variation in GDP per capita 2007",
    x = "Country GDP per capita",
    y = "Frequency",
    caption = "Source:Gapminder"
  )

Code
# Histograms of country GDP per capita, one panel per survey year.
# Fill and outline colours are fixed constants, so no legend is produced.
gapminder %>%
  ggplot(aes(x = gdpPercap)) +
  geom_histogram(bins = 12, fill = "lightblue", color = "white") +
  theme_classic() +
  facet_wrap(vars(year)) +
  # xlim() sets scale limits, so values outside 0-60,000 are dropped before
  # binning (ggplot2 warns about the removed rows). Presumably intended to
  # keep the outlier from stretching the x axis.
  xlim(0, 60000) +
  labs(
    title = "Variation in GDP per capita over time",
    x = "Country GDP per capita",
    y = "Frequency",
    caption = "Source:Gapminder"
  )

Code
# GDP per capita by continent in 2007: one boxplot per continent,
# with the continent mean overlaid as a light-blue point.
gapminder %>%
  filter(year == 2007) %>%
  ggplot(aes(x = continent, y = gdpPercap)) +
  geom_boxplot() +
  # `fun` is the current name of the argument formerly called `fun.y`
  # (deprecated since ggplot2 3.3.0).
  stat_summary(fun = "mean", geom = "point", color = "lightblue", size = 3) +
  theme_classic() +
  labs(
    title = "GDP per capita across continents in 2007",
    x = "",
    y = "Country GDP per capita",
    caption = "Source:Gapminder"
  )

Code
# GDP per capita trajectory of every country, with Israel highlighted
# as a thick black line over thin light-blue lines for all other countries.
gapminder %>%
  # Two-level flag: "Israel" for Israel, "" for everything else.
  mutate(country_israel = ifelse(country == "Israel", "Israel", "")) %>%
  ggplot(aes(
    x = year,
    y = gdpPercap,
    group = country,
    color = country_israel,
    # Line thickness is mapped via `linewidth`; using `size` for lines is
    # deprecated since ggplot2 3.4.0.
    linewidth = country_israel
  )) +
  geom_line(alpha = 0.7) +
  # Unnamed manual values are assigned in sorted level order:
  # "" (other countries) first, then "Israel".
  scale_colour_manual(name = "", values = c("lightblue", "black")) +
  scale_linewidth_manual(name = "", values = c(0.2, 2)) +
  theme_classic() +
  labs(
    title = "GDP per capita increase across countries",
    x = "",
    y = "Country GDP per capita",
    caption = "Source:Gapminder"
  )

Code
# Median country GDP per capita per continent and survey year.
gapminder_gdpPercap_tab <- gapminder %>%
  group_by(continent, year) %>%
  # Drop the grouping explicitly so the result is a plain tibble and
  # summarise() does not emit its "grouped output" message.
  summarise(gdpPercap = median(gdpPercap), .groups = "drop")

# Trend lines of the continent medians, labelled directly at the last
# survey year (2007) instead of using a colour legend.
gapminder_gdpPercap_tab %>%
  ggplot(aes(x = year, y = gdpPercap, color = continent)) +
  # `linewidth` controls line thickness (`size` for lines is deprecated
  # since ggplot2 3.4.0); `size` still applies to the points.
  geom_line(linewidth = 1) +
  geom_point(size = 1.5) +
  geom_label_repel(
    data = gapminder_gdpPercap_tab %>% filter(year == 2007),
    aes(label = continent),
    nudge_x = 1,
    nudge_y = 1
  ) +
  theme_classic() +
  # Hide the colour legend; the end-of-line labels identify the continents.
  guides(color = "none") +
  labs(
    title = "GDP per capita increase across continents",
    x = "",
    y = "Country median GDP per capita",
    caption = "Source:Gapminder"
  )

Code
# GDP per capita over time for a hand-picked set of five countries.
# NOTE: this reuses (overwrites) the name `gapminder_gdpPercap_tab`.
gapminder_gdpPercap_tab <- gapminder %>%
  filter(country %in% c("Israel", "Lebanon", "Greece", "Turkey", "Syria")) %>%
  group_by(country, year) %>%
  # With a single row per country-year (as in gapminder), median() simply
  # passes the value through; `.groups = "drop"` returns a plain tibble and
  # silences summarise()'s "grouped output" message.
  summarise(gdpPercap = median(gdpPercap), .groups = "drop")

# One trend line per selected country, labelled directly at the last
# survey year (2007) instead of using a colour legend.
gapminder_gdpPercap_tab %>%
  ggplot(aes(x = year, y = gdpPercap, color = country)) +
  # `linewidth` controls line thickness (`size` for lines is deprecated
  # since ggplot2 3.4.0); `size` still applies to the points.
  geom_line(linewidth = 1) +
  geom_point(size = 1.5) +
  geom_label_repel(
    data = gapminder_gdpPercap_tab %>% filter(year == 2007),
    aes(label = country),
    nudge_x = 1,
    nudge_y = 1
  ) +
  theme_classic() +
  # Hide the colour legend; the end-of-line labels identify the countries.
  guides(color = "none") +
  labs(
    # The lines are countries, not continents, so the title says so.
    title = "GDP per capita increase across selected countries",
    x = "",
    y = "Country GDP per capita",
    caption = "Source:Gapminder"
  )